# Supplemental Online Resources Improve Political Methods Education
# Marco Alcocer, Leonardo Falabella, Alexandra Lange, Nicholas Smith, and
# Maureen Feeley
# Replication code

# Packages required -------------------------------------------------------

library(AER)
library(here)
library(tidyverse)
library(sandwich)
library(lmtest)
library(stargazer)
library(gridExtra)
library(vtable)

# Read data ---------------------------------------------------------------

dfd <- as_tibble(readRDS(here("replication", 
                              "data.rds")))

# cluster_se function -----------------------------------------------------

# Create cluster_se function to extract standard errors clustered by student for
# each regression model
cluster_se <- function(x) 
  coeftest(x, vcov = vcovCL, cluster = ~ DeID)[, "Std. Error"]

# Wrangle data from student-question to student level --------------------

# Data frame will be used for Tables 5 and Figure 3 (body) and additional
# descriptive plots and tables in the appendix.
pdat <- dfd %>%
  group_by(DeID) %>%
  summarise(
    views = sum(n_views),
    quizzes = sum(quizzes_taken),
    gpa = mean(gpa, na.rm = TRUE),
    mean_score = mean(pct_score)
  ) %>%
  # Did the student view at least one page? Take at least one quiz?
  mutate(viewed_page = factor(ifelse(views > 0, "Yes", "No"),
                              levels = c("Yes", "No")),
         took_quiz = factor(ifelse(quizzes > 0, "Yes", "No"),
                            levels = c("Yes", "No"))) %>% 
  left_join(
    dfd %>%
      distinct(DeID, .keep_all = TRUE) %>%
      select(DeID, urm),
    by = "DeID"
  )

# Table 4 (Main results) -------------------------------------------------

# Estimate regression models
mr_mdls <- list(
  # Model 1: OLS, percent score, access to treatment
  mr_m1 <- lm(pct_score ~ treatment +
                factor(DeID) +
                factor(question),
              data = dfd), 
  # Model 2: IV-2SLS, percent score, page view
  mr_m2 <- ivreg(
    pct_score ~ viewed_page +
      factor(DeID) +
      factor(question) |
      treatment +
      factor(DeID) +
      factor(question),
    data = dfd
  ), 
  # Model 3: IV-2SLS, percent score, quiz completion
  mr_m3 <- ivreg(
    pct_score ~ took_quiz +
      factor(DeID) +
      factor(question) |
      treatment +
      factor(DeID) +
      factor(question),
    data = dfd
  ), 
  # Model 4: OLS, standardized score, access to treatment
  mr_m4 <- lm(std_score ~ treatment +
                factor(DeID),
              data = dfd),
  # Model 5: IV-2SLS, standardized score, page view
  mr_m5 <- ivreg(std_score ~ viewed_page +
                   factor(DeID) |
                   treatment +
                   factor(DeID),
                 data = dfd),
  # Model 6: IV-2SLS, standardized score, quiz completion
  mr_m6 <- ivreg(std_score ~ took_quiz +
                   factor(DeID) |
                   treatment +
                   factor(DeID),
                 data = dfd)
)

# Generate main results table -- LaTeX
stargazer(
  mr_mdls,
  header = F,
  font.size = "small",
  df = F,
  keep = c("treatment", "viewed_page", "took_quiz"),
  # List apply cluster_se function for clustered standard errors
  se = lapply(mr_mdls,
              cluster_se),
  add.lines = list(
    c("Model", rep(c(
      "OLS", rep("IV-2SLS", 2)
    ), 2)),
    c("Student FE", rep("Yes", 6)),
    c("Question FE", rep("Yes", 3), rep("No", 3))
  ),
  column.labels   = c("Percent", "Standardized"),
  column.separate = c(3, 3),
  dep.var.caption = "Question Score",
  omit.stat = c("f", "adj.rsq", "ser"),
  dep.var.labels.include = FALSE,
  covariate.labels = c(
    "Treatment (OSI available)",
    "Compliance (viewed a page)",
    "Compliance (completed a quiz)"
  ),
  column.sep.width = "1pt",
  model.names = FALSE,
  title = "Main results: Effects of OSI on student grades in exam questions. Models 1-3 use question grades in percent as outcome measure. Models 4-6 use standardized question grade as outcome measure. Models 1 and 4 estimate the ITT, models 2 and 4 estimate the LATE using module view as compliance, models 3 and 6 estimate the LATE using quiz taking as compliance.",
  label = "table:main",
  notes = "Standard errors clustered by student in all columns.",
  type = "latex",
  out = here("replication",
             "tables",
             "main_results_table.tex")
)

# Figure 2 (Coefficient plot with main results, pct score) ---------------

# Set up data frame for the plot
tibble(
  # Columns: model, slope, lowerci, upperci
  model = factor(
    c(
      "Online Supplemental\nInstruction Available\n(ITT)",
      "Viewed a Page\n(LATE)",
      "Completed a Quiz\n(LATE)"
    ),
    levels = c(
      "Completed a Quiz\n(LATE)",
      "Viewed a Page\n(LATE)",
      "Online Supplemental\nInstruction Available\n(ITT)"
    )
  ),
  # Get coefficients (slopes)
  slope = c(
    mr_m1$coefficients["treatment"],
    mr_m2$coefficients["viewed_page"],
    mr_m3$coefficients["took_quiz"]
  ),
  # Get lower CIs
  lowerci = c(
    confint(mr_m1, "treatment", level = 0.95)[1],
    confint(mr_m2, "viewed_page", level = 0.95)[1],
    confint(mr_m3, "took_quiz", level = 0.95)[1]
  ),
  # Get upper CIs
  upperci = c(
    confint(mr_m1, "treatment", level = 0.95)[2],
    confint(mr_m2, "viewed_page", level = 0.95)[2],
    confint(mr_m3, "took_quiz", level = 0.95)[2]
  )
) %>%
  # Plot
  ggplot(aes(x = slope, y = model)) +
  geom_point() +
  geom_linerange(aes(xmin = lowerci,
                     xmax = upperci)) +
  scale_x_continuous(limits = c(0, 20)) +
  geom_vline(xintercept = 0,
             linetype = "dashed") +
  theme_bw() +
  labs(y = element_blank(),
       x = "Percentage point increase in question score") +
  theme(axis.text = element_text(size = 12),
        axis.title = element_text(size = 12))
# Save plot
ggsave(
  here("replication",
       "figures",
       "main_coef_plot.pdf"),
  width = 13.5,
  height = 7.25,
  units = "cm"
)

# Remove objects
rm(mr_m1, mr_m2, mr_m3,
   mr_m4, mr_m5, mr_m6,
   mr_mdls)

# Table 5 (Heterogeneous treatment effects) ------------------------------

# Estimate regression models
hte_mdls <- list(
  # HTE Model 1: Treatment*GPA, percent score
  hte_m1 <- lm(pct_score ~ treatment * gpa +
                  factor(DeID) +
                  factor(question),
                data = dfd), 
  # HTE Model 2: Treatment*URM, percent score
  hte_m2 <- lm(pct_score ~ treatment * urm +
                  factor(DeID) +
                  factor(question),
                data = dfd), 
  # HTE Model 3: Treatment*GPA, standardized score
  hte_m3  <- lm(std_score ~ treatment * gpa +
                   factor(DeID),
                 data = dfd),
  # HTE Model 4: Treatment*URM, standardized score
  hte_m4 <- lm(std_score ~ treatment * urm +
                  factor(DeID),
                data = dfd)
)

# Generate table - LaTeX
stargazer(
  hte_mdls,
  se = lapply(hte_mdls,
              cluster_se),
  keep = c("treatment",
           "treatment:gpa",
           "treatment:urmURM"),
  covariate.labels = c(
    "Treatment",
    "Treatment $\\times$ GPA",
    "Treatment $\\times$ URM"
  ),
  column.labels   = c("Percent", "Standardized"),
  add.lines = list(c("Student FE", rep("Yes", 4)),
                   c("Question FE", rep("Yes", 2), rep("No", 2))),
  column.separate = c(2, 2),
  dep.var.labels.include = FALSE,
  dep.var.caption = "Question Score",
  omit.stat = c("f", "adj.rsq", "ser"),
  notes = "Standard errors clustered by student in all columns.",
  title = "Heterogeneous treatment effects: treatment-by-covariate effects of OSI access on exam question grades given students' incoming cumulative GPA and underrepresented minority status. Models 1 and 2 use question grade in percent as outcome. Models 3 and 4 use standardized question grade as outcome.",
  label = "table:hte",
  header = F,
  font.size = "small",
  type = "latex",
  out = here("replication",
             "tables",
             "hte_table.tex")
)

# Remove objects
rm(hte_m1, hte_m2, hte_m3, 
   hte_m4, hte_mdls)

# Figure 3 (Incoming GPA and module use) ---------------------------------

# GPA and page views, scatter plot
gpa_view <- ggplot(pdat, aes(x = gpa, y = views)) +
  geom_point(alpha = 0.25) +
  geom_smooth(method = lm,
              se = FALSE,
              color = "black") +
  scale_x_continuous(breaks = seq(0, 4, .5)) +
  scale_y_continuous(breaks = seq(0, 140, 20)) +
  theme_bw() +
  labs(x = "Incoming GPA",
       y = "Page views") +
  annotate("text",
           x = 2,
           y = 100,
           label = expression(paste(hat(y), 
                                    " = 7.5 + 2.9x"))) +
  annotate("text",
           x = 1.775,
           y = 85,
           label = expression(paste(R ^ 2, 
                                    " = 0.02"))) + 
  theme(panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank())

# Regression summary
summary(lm(views ~ gpa, pdat))

# GPA and quiz completions, scatter plot
gpa_quiz <- ggplot(pdat, aes(x = gpa, y = quizzes)) +
  geom_point(alpha = 0.25) +
  geom_smooth(method = lm,
              se = FALSE,
              color = "black") +
  scale_x_continuous(breaks = seq(0, 4, .5)) +
  scale_y_continuous(breaks = 0:8) +
  theme_bw() +
  labs(x = "Incoming GPA",
       y = "Quiz completions") +
  annotate("text",
           x = 2,
           y = 6,
           label = expression(paste(hat(y), 
                                    " = -1.1 + 1.1x"))) +
  annotate("text",
           x = 1.725,
           y = 5,
           label = expression(paste(R ^ 2, 
                                    " = 0.03"))) + 
  theme(panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank())

# Regression summary
summary(lm(quizzes ~ gpa, pdat))

# Arrange, save
gpa_use <- grid.arrange(gpa_view, gpa_quiz, ncol = 2)
ggsave(here("replication",
            "figures",
            "scatter_gpa_use.pdf"),
       plot = gpa_use,
       width = 13.5,
       height = 7.25,
       units = "cm")

# Remove plot objects
rm(gpa_view, gpa_use, gpa_quiz)

# Table 3 (Student-level summary stats: GPA, OSI use, exam scores) -------

sumtable(
  pdat[, c("views",
           "quizzes",
           "gpa",
           "mean_score")],
  summ = c(
    'notNA(x)',
    'mean(x)',
    'median(x)',
    'sd(x)',
    'min(x)',
    'pctile(x)[25]',
    'pctile(x)[75]',
    'max(x)'
  ),
  labels = c("Page views", 
             "Quiz completions", 
             "Incoming GPA", 
             "Mean Question Score"),
  out = "csv",
  file = here("replication",
              "tables",
              "summary_student_use.csv")
)

# Remove objects
rm(pdat, dfd, x, cluster_se)